https://www.kaggle.com/datasets/hemanthhari/symptoms-and-covid-presence?resource=download
https://www.kaggle.com/code/midouazerty/symptoms-covid-19-using-7-machine-learning-98 https://www.kaggle.com/code/meesalasaidhanush/symptoms-and-covid-presence-99-acc https://www.kaggle.com/code/dzuljalali/covid-19-classification-using-svm-svc
# import the Libraries
import numpy as np
import pandas as pd
# data visualization
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# dataprep
from dataprep.eda import *
from dataprep.eda.missing import plot_missing
from dataprep.eda import plot_correlation
# splitting the dataset into train set and test set
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score,confusion_matrix
# LDA
## feature scaling
from sklearn.preprocessing import StandardScaler
## import LDA model
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
## import the Logistic Regression model from sklearn using the 2 variances with the help of LDA
from sklearn.linear_model import LogisticRegression
# import the dataset
df = pd.read_csv('Covid Dataset.csv')
df.head(5)
| Breathing Problem | Fever | Dry Cough | Sore throat | Running Nose | Asthma | Chronic Lung Disease | Headache | Heart Disease | Diabetes | ... | Fatigue | Gastrointestinal | Abroad travel | Contact with COVID Patient | Attended Large Gathering | Visited Public Exposed Places | Family working in Public Exposed Places | Wearing Masks | Sanitization from Market | COVID-19 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Yes | Yes | Yes | Yes | Yes | No | No | No | No | Yes | ... | Yes | Yes | No | Yes | No | Yes | Yes | No | No | Yes |
| 1 | Yes | Yes | Yes | Yes | No | Yes | Yes | Yes | No | No | ... | Yes | No | No | No | Yes | Yes | No | No | No | Yes |
| 2 | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | No | Yes | ... | Yes | Yes | Yes | No | No | No | No | No | No | Yes |
| 3 | Yes | Yes | Yes | No | No | Yes | No | No | Yes | Yes | ... | No | No | Yes | No | Yes | Yes | No | No | No | Yes |
| 4 | Yes | Yes | Yes | Yes | Yes | No | Yes | Yes | Yes | Yes | ... | No | Yes | No | Yes | No | Yes | No | No | No | Yes |
5 rows × 21 columns
df.describe()
| Breathing Problem | Fever | Dry Cough | Sore throat | Running Nose | Asthma | Chronic Lung Disease | Headache | Heart Disease | Diabetes | ... | Fatigue | Gastrointestinal | Abroad travel | Contact with COVID Patient | Attended Large Gathering | Visited Public Exposed Places | Family working in Public Exposed Places | Wearing Masks | Sanitization from Market | COVID-19 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 | ... | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 |
| unique | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | ... | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 1 | 1 | 2 |
| top | Yes | Yes | Yes | Yes | Yes | No | No | Yes | No | No | ... | Yes | No | No | Yes | No | Yes | No | No | No | Yes |
| freq | 3620 | 4273 | 4307 | 3953 | 2952 | 2920 | 2869 | 2736 | 2911 | 2846 | ... | 2821 | 2883 | 2983 | 2726 | 2924 | 2820 | 3172 | 5434 | 5434 | 4383 |
4 rows × 21 columns
df.columns
Index(['Breathing Problem', 'Fever', 'Dry Cough', 'Sore throat',
'Running Nose', 'Asthma', 'Chronic Lung Disease', 'Headache',
'Heart Disease', 'Diabetes', 'Hyper Tension', 'Fatigue ',
'Gastrointestinal ', 'Abroad travel', 'Contact with COVID Patient',
'Attended Large Gathering', 'Visited Public Exposed Places',
'Family working in Public Exposed Places', 'Wearing Masks',
'Sanitization from Market', 'COVID-19'],
dtype='object')
plot_missing(df)
0%| | 0/230 [00:00<?, ?it/s]
C:\Users\hp\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\dask\core.py:119: RuntimeWarning: invalid value encountered in divide return func(*(_execute_task(a, cache) for a in args))
| Missing Cells | 0 |
|---|---|
| Missing Cells (%) | 0.0% |
| Missing Columns | 0 |
| Missing Rows | 0 |
| Avg Missing Cells per Column | 0.0 |
| Avg Missing Cells per Row | 0.0 |
# Build a per-column summary of missing data: the absolute number of
# missing cells and that number as a percentage of all rows.
missing_values = df.isnull().sum()  # count of missing cells per column
percent_missing = missing_values / df.shape[0] * 100  # share of rows, in %
value = {'missing_values ': missing_values, 'percent_missing %': percent_missing}
frame = pd.DataFrame(value)
frame
| missing_values | percent_missing % | |
|---|---|---|
| Breathing Problem | 0 | 0.0 |
| Fever | 0 | 0.0 |
| Dry Cough | 0 | 0.0 |
| Sore throat | 0 | 0.0 |
| Running Nose | 0 | 0.0 |
| Asthma | 0 | 0.0 |
| Chronic Lung Disease | 0 | 0.0 |
| Headache | 0 | 0.0 |
| Heart Disease | 0 | 0.0 |
| Diabetes | 0 | 0.0 |
| Hyper Tension | 0 | 0.0 |
| Fatigue | 0 | 0.0 |
| Gastrointestinal | 0 | 0.0 |
| Abroad travel | 0 | 0.0 |
| Contact with COVID Patient | 0 | 0.0 |
| Attended Large Gathering | 0 | 0.0 |
| Visited Public Exposed Places | 0 | 0.0 |
| Family working in Public Exposed Places | 0 | 0.0 |
| Wearing Masks | 0 | 0.0 |
| Sanitization from Market | 0 | 0.0 |
| COVID-19 | 0 | 0.0 |
sns.countplot(x='COVID-19',data=df)
<AxesSubplot:xlabel='COVID-19', ylabel='count'>
df["COVID-19"].value_counts().plot.pie(explode=[0.1,0.1],autopct='%1.1f%%',shadow=True)
plt.title('number of cases');
sns.countplot(x='Breathing Problem',data=df)
<AxesSubplot:xlabel='Breathing Problem', ylabel='count'>
sns.countplot(x='Breathing Problem',hue='COVID-19',data=df)
<AxesSubplot:xlabel='Breathing Problem', ylabel='count'>
sns.countplot(x='Fever',hue='COVID-19',data=df);
sns.countplot(x='Dry Cough',hue='COVID-19',data=df)
<AxesSubplot:xlabel='Dry Cough', ylabel='count'>
sns.countplot(x='Sore throat',hue='COVID-19',data=df)
<AxesSubplot:xlabel='Sore throat', ylabel='count'>
from sklearn.preprocessing import LabelEncoder

# Encode every column of the dataset (symptoms, exposure factors and the
# COVID-19 target) as integers. All columns hold 'Yes'/'No' strings and
# LabelEncoder sorts its labels, so 'No' becomes 0 and 'Yes' becomes 1;
# a column with a single distinct value is encoded as all zeros.
e = LabelEncoder()
for column in df.columns:
    # The encoder is re-fitted on each column, exactly as a separate
    # fit_transform call per column would do; iterating over df.columns
    # avoids hand-typed names (some carry a trailing space, e.g. 'Fatigue ').
    df[column] = e.fit_transform(df[column])
# print dataset again
df.head(5)
| Breathing Problem | Fever | Dry Cough | Sore throat | Running Nose | Asthma | Chronic Lung Disease | Headache | Heart Disease | Diabetes | ... | Fatigue | Gastrointestinal | Abroad travel | Contact with COVID Patient | Attended Large Gathering | Visited Public Exposed Places | Family working in Public Exposed Places | Wearing Masks | Sanitization from Market | COVID-19 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 1 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 1 |
| 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 |
| 2 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | ... | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 3 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 |
| 4 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | ... | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 |
5 rows × 21 columns
df.dtypes.value_counts()
int32 21 dtype: int64
df.describe(include='all')
| Breathing Problem | Fever | Dry Cough | Sore throat | Running Nose | Asthma | Chronic Lung Disease | Headache | Heart Disease | Diabetes | ... | Fatigue | Gastrointestinal | Abroad travel | Contact with COVID Patient | Attended Large Gathering | Visited Public Exposed Places | Family working in Public Exposed Places | Wearing Masks | Sanitization from Market | COVID-19 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | ... | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.0 | 5434.0 | 5434.000000 |
| mean | 0.666176 | 0.786345 | 0.792602 | 0.727457 | 0.543246 | 0.462643 | 0.472028 | 0.503497 | 0.464299 | 0.476261 | ... | 0.519139 | 0.469452 | 0.451049 | 0.501656 | 0.461907 | 0.518955 | 0.416268 | 0.0 | 0.0 | 0.806588 |
| std | 0.471621 | 0.409924 | 0.405480 | 0.445309 | 0.498172 | 0.498648 | 0.499263 | 0.500034 | 0.498770 | 0.499482 | ... | 0.499680 | 0.499112 | 0.497644 | 0.500043 | 0.498593 | 0.499687 | 0.492984 | 0.0 | 0.0 | 0.395009 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 |
| 25% | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 1.000000 |
| 50% | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | ... | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.0 | 0.0 | 1.000000 |
| 75% | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.0 | 0.0 | 1.000000 |
| max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.0 | 0.0 | 1.000000 |
8 rows × 21 columns
df.hist(figsize=(20,15));
corr=df.corr()
corr.style.background_gradient(cmap='coolwarm',axis=None)
| Breathing Problem | Fever | Dry Cough | Sore throat | Running Nose | Asthma | Chronic Lung Disease | Headache | Heart Disease | Diabetes | Hyper Tension | Fatigue | Gastrointestinal | Abroad travel | Contact with COVID Patient | Attended Large Gathering | Visited Public Exposed Places | Family working in Public Exposed Places | Wearing Masks | Sanitization from Market | COVID-19 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Breathing Problem | 1.000000 | 0.089903 | 0.159562 | 0.303768 | 0.055190 | 0.075318 | -0.098291 | -0.062172 | -0.073366 | 0.055427 | 0.045256 | 0.000561 | -0.075390 | 0.117795 | 0.214634 | 0.200304 | 0.066688 | 0.018295 | nan | nan | 0.443764 |
| Fever | 0.089903 | 1.000000 | 0.127580 | 0.322235 | 0.081758 | 0.073953 | -0.025160 | -0.035416 | -0.031462 | 0.050286 | 0.079001 | -0.060458 | -0.008067 | 0.128726 | 0.164704 | 0.070490 | 0.002252 | 0.012102 | nan | nan | 0.352891 |
| Dry Cough | 0.159562 | 0.127580 | 1.000000 | 0.213907 | -0.030763 | 0.086843 | -0.043664 | -0.035912 | 0.047566 | -0.006593 | 0.081989 | -0.039909 | 0.008251 | 0.331418 | 0.128330 | 0.117963 | 0.086176 | 0.163102 | nan | nan | 0.464292 |
| Sore throat | 0.303768 | 0.322235 | 0.213907 | 1.000000 | 0.039450 | 0.081377 | -0.050440 | -0.015971 | 0.002177 | 0.001938 | 0.042811 | -0.023290 | 0.025886 | 0.205986 | 0.189251 | 0.216438 | 0.079055 | 0.104378 | nan | nan | 0.502848 |
| Running Nose | 0.055190 | 0.081758 | -0.030763 | 0.039450 | 1.000000 | -0.022763 | -0.014376 | 0.068479 | -0.056750 | 0.042961 | -0.020445 | 0.007026 | -0.014673 | 0.034526 | 0.003776 | 0.061099 | 0.032568 | -0.061323 | nan | nan | -0.005657 |
| Asthma | 0.075318 | 0.073953 | 0.086843 | 0.081377 | -0.022763 | 1.000000 | -0.033771 | 0.037064 | 0.076783 | -0.012060 | 0.017707 | 0.006564 | 0.101909 | 0.068286 | 0.005046 | -0.044592 | 0.020941 | -0.115679 | nan | nan | 0.089930 |
| Chronic Lung Disease | -0.098291 | -0.025160 | -0.043664 | -0.050440 | -0.014376 | -0.033771 | 1.000000 | -0.050480 | -0.039860 | 0.046789 | -0.010331 | -0.047655 | -0.050333 | -0.088854 | -0.062482 | -0.020548 | -0.093049 | 0.038343 | nan | nan | -0.056837 |
| Headache | -0.062172 | -0.035416 | -0.035912 | -0.015971 | 0.068479 | 0.037064 | -0.050480 | 1.000000 | 0.048471 | 0.032390 | -0.207489 | 0.052035 | 0.097778 | 0.043589 | -0.082101 | -0.162992 | -0.005790 | -0.012625 | nan | nan | -0.027793 |
| Heart Disease | -0.073366 | -0.031462 | 0.047566 | 0.002177 | -0.056750 | 0.076783 | -0.039860 | 0.048471 | 1.000000 | -0.032956 | 0.049139 | -0.058925 | 0.004121 | -0.020761 | -0.025593 | -0.045437 | 0.086169 | 0.035000 | nan | nan | 0.027072 |
| Diabetes | 0.055427 | 0.050286 | -0.006593 | 0.001938 | 0.042961 | -0.012060 | 0.046789 | 0.032390 | -0.032956 | 1.000000 | 0.042543 | -0.043903 | 0.040651 | 0.039013 | -0.085696 | -0.061650 | -0.078212 | 0.097696 | nan | nan | 0.040627 |
| Hyper Tension | 0.045256 | 0.079001 | 0.081989 | 0.042811 | -0.020445 | 0.017707 | -0.010331 | -0.207489 | 0.049139 | 0.042543 | 1.000000 | -0.027605 | -0.067972 | -0.016382 | 0.027307 | 0.002911 | 0.019174 | 0.048152 | nan | nan | 0.102575 |
| Fatigue | 0.000561 | -0.060458 | -0.039909 | -0.023290 | 0.007026 | 0.006564 | -0.047655 | 0.052035 | -0.058925 | -0.043903 | -0.027605 | 1.000000 | 0.009356 | -0.068401 | -0.027383 | -0.031058 | -0.009562 | -0.025623 | nan | nan | -0.044188 |
| Gastrointestinal | -0.075390 | -0.008067 | 0.008251 | 0.025886 | -0.014673 | 0.101909 | -0.050333 | 0.097778 | 0.004121 | 0.040651 | -0.067972 | 0.009356 | 1.000000 | 0.099577 | 0.025277 | -0.017251 | -0.061885 | -0.027603 | nan | nan | -0.003367 |
| Abroad travel | 0.117795 | 0.128726 | 0.331418 | 0.205986 | 0.034526 | 0.068286 | -0.088854 | 0.043589 | -0.020761 | 0.039013 | -0.016382 | -0.068401 | 0.099577 | 1.000000 | 0.080210 | 0.113399 | 0.069609 | 0.143094 | nan | nan | 0.443875 |
| Contact with COVID Patient | 0.214634 | 0.164704 | 0.128330 | 0.189251 | 0.003776 | 0.005046 | -0.062482 | -0.082101 | -0.025593 | -0.085696 | 0.027307 | -0.027383 | 0.025277 | 0.080210 | 1.000000 | 0.234649 | 0.079800 | 0.006909 | nan | nan | 0.357122 |
| Attended Large Gathering | 0.200304 | 0.070490 | 0.117963 | 0.216438 | 0.061099 | -0.044592 | -0.020548 | -0.162992 | -0.045437 | -0.061650 | 0.002911 | -0.031058 | -0.017251 | 0.113399 | 0.234649 | 1.000000 | 0.083795 | 0.063776 | nan | nan | 0.390145 |
| Visited Public Exposed Places | 0.066688 | 0.002252 | 0.086176 | 0.079055 | 0.032568 | 0.020941 | -0.093049 | -0.005790 | 0.086169 | -0.078212 | 0.019174 | -0.009562 | -0.061885 | 0.069609 | 0.079800 | 0.083795 | 1.000000 | 0.028486 | nan | nan | 0.119755 |
| Family working in Public Exposed Places | 0.018295 | 0.012102 | 0.163102 | 0.104378 | -0.061323 | -0.115679 | 0.038343 | -0.012625 | 0.035000 | 0.097696 | 0.048152 | -0.025623 | -0.027603 | 0.143094 | 0.006909 | 0.063776 | 0.028486 | 1.000000 | nan | nan | 0.160208 |
| Wearing Masks | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
| Sanitization from Market | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
| COVID-19 | 0.443764 | 0.352891 | 0.464292 | 0.502848 | -0.005657 | 0.089930 | -0.056837 | -0.027793 | 0.027072 | 0.040627 | 0.102575 | -0.044188 | -0.003367 | 0.443875 | 0.357122 | 0.390145 | 0.119755 | 0.160208 | nan | nan | 1.000000 |
# Some features show essentially no correlation with the COVID-19 target
# (the constant columns even produce NaN correlations), for example:
# Running Nose / Asthma / Chronic Lung Disease / Headache / Heart Disease / Diabetes / Fatigue / Gastrointestinal / Wearing Masks / Sanitization from Market
# so we drop those columns before modelling.
columns_to_drop = [
    'Running Nose',
    'Chronic Lung Disease',
    'Headache',
    'Heart Disease',
    'Diabetes',
    'Gastrointestinal ',  # the trailing space is part of the column name
    'Wearing Masks',
    'Sanitization from Market',
    'Asthma',
    'Fatigue ',  # the trailing space is part of the column name
]
# A single drop call removes all of them without building ten intermediate frames.
df = df.drop(columns=columns_to_drop)
df.columns
Index(['Breathing Problem', 'Fever', 'Dry Cough', 'Sore throat',
'Hyper Tension', 'Abroad travel', 'Contact with COVID Patient',
'Attended Large Gathering', 'Visited Public Exposed Places',
'Family working in Public Exposed Places', 'COVID-19'],
dtype='object')
df.dtypes.value_counts()
int32 11 dtype: int64
corr=df.corr()
corr.style.background_gradient(cmap='coolwarm',axis=None)
| Breathing Problem | Fever | Dry Cough | Sore throat | Hyper Tension | Abroad travel | Contact with COVID Patient | Attended Large Gathering | Visited Public Exposed Places | Family working in Public Exposed Places | COVID-19 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Breathing Problem | 1.000000 | 0.089903 | 0.159562 | 0.303768 | 0.045256 | 0.117795 | 0.214634 | 0.200304 | 0.066688 | 0.018295 | 0.443764 |
| Fever | 0.089903 | 1.000000 | 0.127580 | 0.322235 | 0.079001 | 0.128726 | 0.164704 | 0.070490 | 0.002252 | 0.012102 | 0.352891 |
| Dry Cough | 0.159562 | 0.127580 | 1.000000 | 0.213907 | 0.081989 | 0.331418 | 0.128330 | 0.117963 | 0.086176 | 0.163102 | 0.464292 |
| Sore throat | 0.303768 | 0.322235 | 0.213907 | 1.000000 | 0.042811 | 0.205986 | 0.189251 | 0.216438 | 0.079055 | 0.104378 | 0.502848 |
| Hyper Tension | 0.045256 | 0.079001 | 0.081989 | 0.042811 | 1.000000 | -0.016382 | 0.027307 | 0.002911 | 0.019174 | 0.048152 | 0.102575 |
| Abroad travel | 0.117795 | 0.128726 | 0.331418 | 0.205986 | -0.016382 | 1.000000 | 0.080210 | 0.113399 | 0.069609 | 0.143094 | 0.443875 |
| Contact with COVID Patient | 0.214634 | 0.164704 | 0.128330 | 0.189251 | 0.027307 | 0.080210 | 1.000000 | 0.234649 | 0.079800 | 0.006909 | 0.357122 |
| Attended Large Gathering | 0.200304 | 0.070490 | 0.117963 | 0.216438 | 0.002911 | 0.113399 | 0.234649 | 1.000000 | 0.083795 | 0.063776 | 0.390145 |
| Visited Public Exposed Places | 0.066688 | 0.002252 | 0.086176 | 0.079055 | 0.019174 | 0.069609 | 0.079800 | 0.083795 | 1.000000 | 0.028486 | 0.119755 |
| Family working in Public Exposed Places | 0.018295 | 0.012102 | 0.163102 | 0.104378 | 0.048152 | 0.143094 | 0.006909 | 0.063776 | 0.028486 | 1.000000 | 0.160208 |
| COVID-19 | 0.443764 | 0.352891 | 0.464292 | 0.502848 | 0.102575 | 0.443875 | 0.357122 | 0.390145 | 0.119755 | 0.160208 | 1.000000 |
X=df.drop('COVID-19',axis=1)
y=df['COVID-19']
# splitting the dataset into train set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,
random_state = 0)
# because every feature is binary (0 or 1), feature scaling is not needed
# LDA: fit the transformer on the training split only, then project both
# the training and the test split with that same fitted transformer.
lda = LDA().fit(X_train, y_train)
X_train_lda, X_test_lda = (lda.transform(split) for split in (X_train, X_test))
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier on the LDA-projected training data.
model = LogisticRegression().fit(X_train_lda, y_train)
# Predicted classes for the projected test split (used by the evaluation cells).
y_pred = model.predict(X_test_lda)
# Accuracy on the test split, expressed as a percentage.
acc_logreg = 100 * model.score(X_test_lda, y_test)
acc_logreg
94.84820607175713
from sklearn.metrics import confusion_matrix
# create the confusion matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
<AxesSubplot:>
# import the accuracy score
from sklearn.metrics import accuracy_score
# print the accuracy score
print(accuracy_score(y_test, y_pred))
0.9484820607175714
# import the classification report
from sklearn.metrics import classification_report
# print the classification report
print(classification_report(y_test, y_pred))
precision recall f1-score support
0 0.89 0.81 0.85 192
1 0.96 0.98 0.97 895
accuracy 0.95 1087
macro avg 0.93 0.89 0.91 1087
weighted avg 0.95 0.95 0.95 1087
# predict the result from user input, applying the fitted LDA transform before the classifier
def predict_result():
    """Prompt for the ten model features and print the predicted status.

    Each feature is read from standard input as 0 (No) or 1 (Yes), matching
    the label encoding applied to the training data. The answers are
    projected with the fitted module-level ``lda`` transformer and classified
    with the fitted module-level ``model``. Prints ``'Infected'`` or
    ``'Not Infected'`` and returns ``None``.

    Raises:
        ValueError: if an answer is not an integer, or is neither 0 nor 1.
    """
    # These names serve as both the prompts and the DataFrame columns; they
    # must match, in spelling and order, the columns the LDA was fitted on.
    feature_names = [
        'Breathing Problem',
        'Fever',
        'Dry Cough',
        'Sore throat',
        'Hyper Tension',
        'Abroad travel',
        'Contact with COVID Patient',
        'Attended Large Gathering',
        'Visited Public Exposed Places',
        'Family working in Public Exposed Places',
    ]

    # input the data
    answers = {}
    for name in feature_names:
        value = int(input(f'{name}: '))
        # The model only ever saw 0/1 features, so reject anything else
        # instead of silently predicting on out-of-range input.
        if value not in (0, 1):
            raise ValueError(f'{name} must be 0 (No) or 1 (Yes), got {value}')
        answers[name] = value

    # A one-row DataFrame carrying the training column names avoids the
    # "X does not have valid feature names" warning that a bare NumPy array
    # triggers on a transformer fitted with a DataFrame.
    data = pd.DataFrame([answers], columns=feature_names)
    # transform the data
    data = lda.transform(data)
    # predict the result
    result = model.predict(data)
    # print the result; predict returns a one-element array for the single row
    if result[0] == 0:
        print('Not Infected')
    else:
        print('Infected')
# call the function
predict_result()
Not Infected
C:\Users\hp\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LinearDiscriminantAnalysis was fitted with feature names warnings.warn(
!jupyter nbconvert --to html "./LDA-Project" --output-dir="./"
Markdown basics https://markdown-guide.readthedocs.io/en/latest/basics.html#